import json
import random
import time
import pymysql
import requests
import re
from concurrent.futures import ThreadPoolExecutor

# Browser-like User-Agent so the target site does not reject the requests
# as coming from an automated client.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Module-level MySQL connection shared by parse(); opened at import time and
# closed by the __main__ driver. DictCursor returns rows as dicts.
# NOTE(review): credentials are hard-coded — consider moving to env/config.
connection = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='123456', db='jobinformation',
                             charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)


def parse(url):
    """Fetch one Lagou listing page and insert its job postings into MySQL.

    Extracts the JSON payload embedded in the page's ``__NEXT_DATA__``
    script tag, walks the result list, and inserts one row per posting
    into ``lagou_table_1`` using the module-level ``connection``.

    Args:
        url: Listing-page URL expected to contain the __NEXT_DATA__ tag.
    """
    response = requests.get(url, headers=headers)
    matches = re.findall(r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
                         response.text, re.S)
    if not matches:
        # Page layout changed or the request was blocked — nothing to parse.
        print('No __NEXT_DATA__ payload found for', url)
        return
    data = json.loads(matches[0])
    try:
        results = data['props']['pageProps']['initData']['content']['positionResult']['result']
    except (KeyError, TypeError) as exc:
        # Payload shape differs from what we expect; skip this page.
        print('Unexpected payload structure for', url, '-', exc)
        return

    # Parameterized query: lets the driver escape values (the old
    # str.format version was vulnerable to SQL injection / quote breakage).
    sql = ("INSERT INTO `lagou_table_1` VALUES "
           "(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")

    # Context manager guarantees the cursor is closed even on error.
    with connection.cursor() as cursor:
        for item in results:
            industry_field = str(item['industryField']).replace(',', '/')
            # Strip newlines and simple HTML markup from the description.
            position_detail = (str(item['positionDetail']).replace('\n', '')
                               .replace('<br />', '').replace('<p>', '').replace('</p>', ''))
            company_labels = '，'.join(str(label) for label in item['companyLabelList'])
            row = (item['positionId'], item['companyFullName'], item['positionAddress'],
                   item['salary'], item['education'], item['workYear'], industry_field,
                   position_detail, item['positionName'], item['city'], item['companySize'],
                   company_labels)
            print(*row)
            try:
                cursor.execute(sql, row)
            except pymysql.MySQLError as exc:
                # Best-effort insert: skip rows that fail (e.g. duplicate
                # primary key) instead of aborting the whole page.
                print('Insert failed for position', item['positionId'], '-', exc)
        connection.commit()


if __name__ == "__main__":
    # urls = ['https://www.lagou.com/wn/zhaopin?pn={}'.format(i) for i in range(1, 68)]
    # pool = ThreadPoolExecutor(max_workers=5)
    # pool.map(parse, urls)

    # 总67页
    for i in range(1, 68):
        base_url = ('https://www.lagou.com/wn/zhaopin?pn={}&'
                    'hy=IT技术服务｜咨询%2C营销服务｜咨询%2C人工智能服务%2C软件服务｜咨询%2C数据服务｜咨询%2C信息安全%2C信息检索%2C物联网%2C区块链%2C网络通信').format(i)
        parse(base_url)
        time.sleep(random.randint(1, 6))
    connection.close()
